import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import RocCurveDisplay
from xgboost import plot_importance
import datetime
import seaborn as sns
from sklearn.cluster import KMeans
import plotly.express as px
from scipy.stats import kruskal
from scipy.stats import chi2_contingency
import multiprocessing
from imblearn.over_sampling import SMOTE
from collections import Counter
# Plot styling and reproducibility setup.
sns.set_style("darkgrid")
sns.set_context("talk")
np.random.seed(42)  # fix the RNG so resampling/clustering results are repeatable
df = pd.read_csv('data/weatherAUS.csv')  # Dataset 1: daily Australian weather observations
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 145460 entries, 0 to 145459 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 145460 non-null object 1 Location 145460 non-null object 2 MinTemp 143975 non-null float64 3 MaxTemp 144199 non-null float64 4 Rainfall 142199 non-null float64 5 Evaporation 82670 non-null float64 6 Sunshine 75625 non-null float64 7 WindGustDir 135134 non-null object 8 WindGustSpeed 135197 non-null float64 9 WindDir9am 134894 non-null object 10 WindDir3pm 141232 non-null object 11 WindSpeed9am 143693 non-null float64 12 WindSpeed3pm 142398 non-null float64 13 Humidity9am 142806 non-null float64 14 Humidity3pm 140953 non-null float64 15 Pressure9am 130395 non-null float64 16 Pressure3pm 130432 non-null float64 17 Cloud9am 89572 non-null float64 18 Cloud3pm 86102 non-null float64 19 Temp9am 143693 non-null float64 20 Temp3pm 141851 non-null float64 21 RainToday 142199 non-null object 22 RainTomorrow 142193 non-null object dtypes: float64(16), object(7) memory usage: 25.5+ MB
df.head()
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | ... | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No | No |
| 1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | ... | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | No |
| 2 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | ... | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | No |
| 3 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | ... | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | No |
| 4 | 2008-12-05 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | ... | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No | No |
5 rows × 23 columns
df_2 = pd.read_csv('data/IDCJAC0010_086282_1800_Data.csv')
df_2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 18982 entries, 0 to 18981 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Product code 18982 non-null object 1 Bureau of Meteorology station number 18982 non-null int64 2 Year 18982 non-null int64 3 Month 18982 non-null int64 4 Day 18982 non-null int64 5 Maximum temperature (Degree C) 18800 non-null float64 6 Days of accumulation of maximum temperature 18799 non-null float64 7 Quality 18799 non-null object dtypes: float64(2), int64(4), object(2) memory usage: 1.2+ MB
df_2.head()
| Product code | Bureau of Meteorology station number | Year | Month | Day | Maximum temperature (Degree C) | Days of accumulation of maximum temperature | Quality | |
|---|---|---|---|---|---|---|---|---|
| 0 | IDCJAC0010 | 86282 | 1970 | 1 | 1 | NaN | NaN | NaN |
| 1 | IDCJAC0010 | 86282 | 1970 | 1 | 2 | NaN | NaN | NaN |
| 2 | IDCJAC0010 | 86282 | 1970 | 1 | 3 | NaN | NaN | NaN |
| 3 | IDCJAC0010 | 86282 | 1970 | 1 | 4 | NaN | NaN | NaN |
| 4 | IDCJAC0010 | 86282 | 1970 | 1 | 5 | NaN | NaN | NaN |
latlong=pd.read_csv('data/au.csv')
latlong.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 543 entries, 0 to 542 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 city 543 non-null object 1 lat 543 non-null float64 2 lng 543 non-null float64 3 country 543 non-null object 4 iso2 543 non-null object 5 admin_name 543 non-null object 6 capital 9 non-null object 7 population 543 non-null int64 8 population_proper 543 non-null int64 dtypes: float64(2), int64(2), object(5) memory usage: 38.3+ KB
latlong.head()
| city | lat | lng | country | iso2 | admin_name | capital | population | population_proper | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Sydney | -33.8650 | 151.2094 | Australia | AU | New South Wales | admin | 5312163 | 4840600 |
| 1 | Melbourne | -37.8136 | 144.9631 | Australia | AU | Victoria | admin | 5078193 | 4529500 |
| 2 | Brisbane | -27.4678 | 153.0281 | Australia | AU | Queensland | admin | 2514184 | 2360241 |
| 3 | Perth | -31.9522 | 115.8589 | Australia | AU | Western Australia | admin | 2059484 | 2039200 |
| 4 | Adelaide | -34.9289 | 138.6011 | Australia | AU | South Australia | admin | 1345777 | 1295714 |
Dataset 1 (df): 10 years of daily weather observations from different locations in Australia
Date: date YYYY-MM-DDLocation: location of weather stationMinTemp: minimum temperature in the 24 hours to 9am (degrees Celsius) - numericalMaxTemp: maximum temperaure in the 24 hours from 9am (degrees Celsius) - numericalRainfall: precipitation (rainfall) in the 24 hours to 9am (millimetres) - numericalEvaporation: class A pan evaporation in the 24 hours to 9am (millimetres) - numericalSunshine: bright sunshine in the 24 hours to midnight (hours) - numericalWindGustDir: direction of strongest gust in the 24 hours to midnight (16 compass points) - categoricalWindGustSpeed: speed of strongest wind gust in the 24 hours to midnight (kilometres per hour) - numericalWindDir9am: wind direction averaged over 10 minutes prior to 9am (16 compass points) - categoricalWindDir3pm: wind direction averaged over 10 minutes prior to 3pm (16 compass points)- categoricalWindSpeed9am: wind speed averaged over 10 minutes prior to 9am (kilometres per hour) - numericalWindSpeed3pm: wind speed averaged over 10 minutes prior to 3pm (kilometres per hour) - numericalHumidity9am: relative humidity at 9am (percent) - numericalHumidity3pm: relative humidity at 3pm (percent) - numericalPressure9am: atmospheric pressure reduced to mean sea level at 9am (hectopascals) - numericalPressure3pm: atmospheric pressure reduced to mean sea level at 3pm (hectopascals) - numericalCloud9am: fraction of sky obscured by cloud at 9am (heights) - numericalCloud3pm: fraction of sky obscured by cloud at 3pm (heights) - numericalTemp9am: temperature at 9am (degrees Celsius) - numericalTemp3pm: temperature at 3pm (degrees Celsius) - numericalRainToday: whether it rains on the given day (Yes/No) - categoricalRainTomorrow: whether it rains on the following day (Yes/No) - categoricalData: https://www.kaggle.com/jsphyg/weather-dataset-rattle-package \ Data source: http://www.bom.gov.au/climate/dwo/ and http://www.bom.gov.au/climate/data \ Definitions: http://www.bom.gov.au/climate/dwo/IDCJDW0000.shtml
Dataset 2 (df_2): Daily maximum temperature (degrees Celsius) in Melbourne Airport weather station from 1970 to 2021 \ Station Details
Dataset 3 (latlong): Dataset containing 543 prominent cities in Australia. Each row includes a city's latitude, longitude, state and other variables of interest.
city: The name of the citylat: The latitude of the citylng: The longitude of the citycountry: The country to which the city belongs (Australia)iso2: The alpha-2 iso code of the countryadmin_name: The name of the highest level administration region of the city towncapital: Blank string if not a capital, otherwise: primary - country's capital, admin - first-level admin capital, minor - lower-level admin capitalpopulation: An estimate of the city's urban populationpopulation_proper: An estimate of the city's municipal population. df['Date'].value_counts()
2013-11-12 49
2014-09-01 49
2014-08-23 49
2014-08-24 49
2014-08-25 49
..
2007-11-29 1
2007-11-28 1
2007-11-27 1
2007-11-26 1
2008-01-31 1
Name: Date, Length: 3436, dtype: int64
df[df['Date'] == '2013-11-12'].head()
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1718 | 2013-11-12 | Albury | 12.1 | 18.6 | 4.4 | NaN | NaN | W | 46.0 | SSE | ... | 75.0 | 65.0 | 1015.0 | 1011.4 | 8.0 | 8.0 | 14.3 | 17.1 | Yes | No |
| 4727 | 2013-11-12 | BadgerysCreek | 13.7 | 22.8 | 31.0 | NaN | NaN | NNE | 26.0 | S | ... | 86.0 | 73.0 | 1012.7 | 1010.3 | NaN | NaN | 17.2 | 19.4 | Yes | Yes |
| 7736 | 2013-11-12 | Cobar | 10.0 | 25.2 | 0.0 | 8.4 | NaN | WSW | 44.0 | SW | ... | 63.0 | 21.0 | 1014.2 | 1012.4 | 1.0 | 2.0 | 15.2 | 23.4 | No | No |
| 10745 | 2013-11-12 | CoffsHarbour | 17.9 | 26.4 | 59.6 | NaN | 5.7 | NNE | 50.0 | NNW | ... | 78.0 | 65.0 | 1015.8 | 1012.5 | 7.0 | 8.0 | 22.0 | 25.7 | Yes | Yes |
| 13754 | 2013-11-12 | Moree | 17.5 | 31.1 | NaN | NaN | NaN | W | 46.0 | N | ... | 83.0 | 42.0 | 1012.6 | 1010.5 | 6.0 | 6.0 | 20.1 | 26.3 | NaN | NaN |
5 rows × 23 columns
Date column all filled correctly
df['Location'].isnull().sum()
0
df['Location'].value_counts()
Canberra 3436 Sydney 3344 Darwin 3193 Melbourne 3193 Brisbane 3193 Adelaide 3193 Perth 3193 Hobart 3193 Albany 3040 MountGambier 3040 Ballarat 3040 Townsville 3040 GoldCoast 3040 Cairns 3040 Launceston 3040 AliceSprings 3040 Bendigo 3040 Albury 3040 MountGinini 3040 Wollongong 3040 Newcastle 3039 Tuggeranong 3039 Penrith 3039 Woomera 3009 Nuriootpa 3009 Cobar 3009 CoffsHarbour 3009 Moree 3009 Sale 3009 PerthAirport 3009 PearceRAAF 3009 Witchcliffe 3009 BadgerysCreek 3009 Mildura 3009 NorfolkIsland 3009 MelbourneAirport 3009 Richmond 3009 SydneyAirport 3009 WaggaWagga 3009 Williamtown 3009 Dartmoor 3009 Watsonia 3009 Portland 3009 Walpole 3006 NorahHead 3004 SalmonGums 3001 Katherine 1578 Nhil 1578 Uluru 1578 Name: Location, dtype: int64
Location column filled correctly
df['Evaporation'].isnull().sum() / len(df)
0.43166506256015397
df['Evaporation'].hist()
<AxesSubplot:>
len(df['Evaporation'][df['Evaporation'] < 15])/len(df)
0.5554997937577341
len(df['Evaporation'][(~df['Evaporation'].isnull()) & (df['Evaporation'] < 15)]) / len(df['Evaporation'][~df['Evaporation'].isnull()])
0.9774162332164026
Evaporation has many values missing (~43%) and most <15 (~97%); choice: linear interpolation then drop the ones that could not be interpolated.
# Fill missing Evaporation values by linear interpolation over row order.
# NOTE: Series.interpolate(..., inplace=True) through df['Evaporation'] is
# chained assignment -- deprecated in pandas 2.x and silently a no-op when the
# selection is a copy. Assigning the result back is the reliable form.
df['Evaporation'] = df['Evaporation'].interpolate(method='linear')
df['Evaporation'].isnull().sum()
6049
df['Evaporation'].isnull().sum() / len(df)
0.04158531555066685
df['Evaporation'].hist()
<AxesSubplot:>
df.dropna(subset=['Evaporation'], inplace=True)
df = df.reset_index(drop=True)
len(df)
139411
df['Sunshine'].isnull().sum() / len(df)
0.4575392185695533
df['Sunshine'].hist()
<AxesSubplot:>
Sunshine has more of a normal distribution, but nearly half (~46%) of its entries are missing; choice: drop the column from the analysis.
df.drop(['Sunshine'], axis=1, inplace=True)
df['Cloud9am'].isnull().sum() / len(df)
0.36674293994017687
df['Cloud3pm'].isnull().sum() / len(df)
0.3926232506760586
df['Cloud9am'].hist(label = 'Cloud9am')
df['Cloud3pm'].hist(label = 'Cloud3pm')
plt.legend()
plt.show()
Bimodal distributions for Cloud9am and Cloud3pm: Cloud9am is 37% NAN and Cloud3pm is 39% NAN. Choice: fill NAN entries by linear interpolation
# Interpolate the bimodal cloud-cover columns rather than using a single fill
# value. Assign back instead of Series inplace=True through df[...] (chained
# assignment -- deprecated in pandas 2.x and may not modify df at all).
df['Cloud3pm'] = df['Cloud3pm'].interpolate(method='linear')
df['Cloud9am'] = df['Cloud9am'].interpolate(method='linear')
df['Cloud9am'].isnull().sum()
0
df['Cloud3pm'].isnull().sum()
0
df['Cloud9am'].hist(label = 'Cloud9am')
df['Cloud3pm'].hist(label = 'Cloud3pm')
plt.legend()
plt.show()
df['Rainfall'].isnull().sum() / len(df)
0.022602233683138347
df['Rainfall'].hist(bins=80)
<AxesSubplot:>
len(df['Rainfall'][df['Rainfall'] < 1]) / len(df)
0.7451994462416882
len(df['Rainfall'][(~df['Rainfall'].isnull()) & (df['Rainfall'] < 1)]) / len(df['Rainfall'][~df['Rainfall'].isnull()])
0.762432115074123
Rainfall is below 1mm for roughly 76% of the non-missing entries. Choice: fill NaN entries with the mean.
# Impute missing Rainfall with the column mean. Assign back instead of
# Series.fillna(..., inplace=True) via df[...] (chained-assignment pitfall,
# deprecated in pandas 2.x).
df['Rainfall'] = df['Rainfall'].fillna(value=df['Rainfall'].mean())
df['Rainfall'].hist(bins=80)
<AxesSubplot:>
df['WindGustSpeed'].isnull().sum() /len(df)
0.07284217170811486
df['WindGustSpeed'].hist()
<AxesSubplot:>
Also choosing mean for missing (7%) for wind gust speeds as that looked normal.
# WindGustSpeed is approximately normal: impute missing values with the mean.
# Assign back rather than Series inplace=True (chained-assignment pitfall).
df['WindGustSpeed'] = df['WindGustSpeed'].fillna(value=df['WindGustSpeed'].mean())
df['WindGustSpeed'].hist()
<AxesSubplot:>
df['MinTemp'].isnull().sum() / len(df)
0.010314824511695634
df['MaxTemp'].isnull().sum() / len(df)
0.00875827588927703
df['Humidity9am'].isnull().sum() / len(df)
0.018585334012380657
df['Humidity3pm'].isnull().sum() / len(df)
0.03187696810151279
df['Temp9am'].isnull().sum() / len(df)
0.012316101311948125
df['Temp3pm'].isnull().sum() / len(df)
0.02551448594443767
These columns had only about 1-3% NAN entries overall. Choice: drop NAN rows for acceptable data loss (~4% rows lost from 139411 to 133810 entries)
len(df)
139411
df.dropna(subset=['MinTemp', 'MaxTemp', 'Humidity9am', 'Humidity3pm', 'Temp9am', 'Temp3pm'], inplace=True)
df = df.reset_index(drop=True)
len(df)
133810
df['WindGustDir'].value_counts()
W 9042 SE 8788 N 8714 S 8712 SSE 8682 E 8455 WSW 8388 SSW 8381 SW 8360 NW 7692 WNW 7557 ENE 7510 ESE 6821 NE 6740 NNW 6294 NNE 6043 Name: WindGustDir, dtype: int64
df['WindDir9am'].value_counts()
N 10982 E 8687 SSE 8516 SE 8370 S 8152 NW 8126 W 7897 NNE 7678 NNW 7563 SW 7460 ENE 7435 NE 7210 SSW 7111 ESE 7080 WNW 6976 WSW 6452 Name: WindDir9am, dtype: int64
df['WindDir3pm'].value_counts()
SE 10112 S 9360 W 9301 SW 8937 WSW 8882 SSE 8846 N 8299 WNW 8183 NW 8114 ESE 7840 E 7813 SSW 7782 NE 7746 NNW 7488 ENE 7346 NNE 5984 Name: WindDir3pm, dtype: int64
df['WindGustDir'].isnull().sum() / len(df)
0.05702862267394066
df['WindDir9am'].isnull().sum() / len(df)
0.060645691652342876
df['WindDir3pm'].isnull().sum() / len(df)
0.013280023914505642
Wind directions are categorical and vary with other features (we can check correlations), but these columns have at most ~6% of their entries missing. Filling NaN with a mode or "mean" direction would be hard to justify, so the choice is to drop the NaN rows. Another ~11% of rows are lost, but 119k+ remain (from 133810 to 119409).
df.dropna(subset=['WindGustDir', 'WindDir9am', 'WindDir3pm'], inplace=True)
df = df.reset_index(drop=True)
len(df)
119409
df['Pressure9am'].isnull().sum() / len(df)
0.07350367225251028
df['Pressure3pm'].isnull().sum() / len(df)
0.0731016925022402
df['Pressure9am'].hist(label = 'Pressure9am')
df['Pressure3pm'].hist(label = 'Pressure3pm')
plt.legend()
plt.show()
Pressure distributions are approx normal and mean is used to fill their NAN entries.
# Pressure distributions are roughly normal: mean-impute the missing entries.
# Assign back rather than Series inplace=True (chained-assignment pitfall).
df['Pressure9am'] = df['Pressure9am'].fillna(value=df['Pressure9am'].mean())
df['Pressure3pm'] = df['Pressure3pm'].fillna(value=df['Pressure3pm'].mean())
df['Pressure9am'].hist(label = 'Pressure9am')
df['Pressure3pm'].hist(label = 'Pressure3pm')
plt.legend()
plt.show()
df['RainToday'].isnull().sum() / len(df)
0.014571765947290405
df['RainTomorrow'].isnull().sum() / len(df)
0.013617064040398965
df['RainToday'].value_counts() / len(df[~df['RainToday'].isnull()])
No 0.775387 Yes 0.224613 Name: RainToday, dtype: float64
df['RainTomorrow'].value_counts() / len(df[~df['RainTomorrow'].isnull()])
No 0.776182 Yes 0.223818 Name: RainTomorrow, dtype: float64
RainToday and RainTomorrow are about 78% No and 22% Yes. Choice: since these columns include the target variable (RainTomorrow), we simply drop the rows with missing values ($\sim$2%).
# RainToday/RainTomorrow carry the labels: drop rows where either is missing
# (~2% of rows) rather than imputing target values.
df.dropna(subset=['RainToday', 'RainTomorrow'], inplace=True)
df = df.reset_index(drop=True)
# Check NAN status: every column should report False and the total should be 0.
print(df.isna().any())
print(df.isnull().sum().sum())
print(len(df))
Date False Location False MinTemp False MaxTemp False Rainfall False Evaporation False WindGustDir False WindGustSpeed False WindDir9am False WindDir3pm False WindSpeed9am False WindSpeed3pm False Humidity9am False Humidity3pm False Pressure9am False Pressure3pm False Cloud9am False Cloud3pm False Temp9am False Temp3pm False RainToday False RainTomorrow False dtype: bool 0 116908
# Keep only the date parts and the maximum temperature from the BOM station
# file, and drop rows where the reading is missing.
df_2 = df_2[['Year', 'Month', 'Day', 'Maximum temperature (Degree C)']].rename(columns={'Maximum temperature (Degree C)': 'MaxTemp'})
df_2 = df_2[~df_2['MaxTemp'].isnull()]
# Normalise city names to df's Location spelling (no spaces, e.g. "CoffsHarbour").
latlong['city'] = latlong['city'].str.replace(" ","")
# Fold airport stations into their parent city so they share coordinates.
df.loc[df['Location'] == "SydneyAirport", "Location"] = "Sydney"
df.loc[df['Location'] == "MelbourneAirport", "Location"] = "Melbourne"
df.loc[df['Location'] == "PerthAirport", "Location"] = "Perth"
loc = df['Location'].unique()
# Cities from the coordinates table that match a weather-station location.
# NOTE(review): the trailing [:-1] drops the last matched row -- presumably a
# duplicate/ambiguous city name; confirm against latlong's contents.
ct = latlong[latlong['city'].isin(loc)][:-1]
nl = loc[~np.in1d(loc,ct['city'])]  # locations still lacking coordinates
print(len(nl),len(ct),len(loc))
15 27 42
# Hand-collected (lat, lng) pairs for the 15 locations absent from the cities
# file, listed in the same order as `nl`.
# NOTE(review): correctness depends entirely on nl's ordering -- verify pairing.
latlong_nl=np.array([[-33.2833,151.5661],
[-29.0408,167.9547],
[-32.8080,151.8510],
[-35.4244,149.0888],
[-35.5333,148.7833],
[-36.3333,141.6500],
[-37.7167,145.0833],
[-37.9333,141.2833],
[-31.1999,136.8167],
[-34.0282,115.1008],
[-31.6677,116.0090],
[-32.9747,121.6385],
[-34.9550,116.7696],
[-14.4652,132.2635],
[-25.3073,131.0178]])
nl = nl.reshape(15, 1)
# Combine the manual coordinates with the matched cities into one lookup table.
nl_df = pd.DataFrame(np.concatenate((nl, latlong_nl), axis=1), columns=['Location', 'lat', 'lng'])
latlong_df = pd.concat([nl_df, ct[['city', 'lat', 'lng']].rename(columns={'city': 'Location'})])
latlong_df = latlong_df.reset_index(drop=True)
# Attach coordinates to every row; validate each Location maps to exactly one entry.
df=pd.merge(df,latlong_df,how='left',on='Location',validate='many_to_one')
# Encode the Yes/No flags as 1/0. Assign back instead of Series.replace(...,
# inplace=True) through df[...] -- chained assignment, deprecated in pandas 2.x.
df['RainToday'] = df['RainToday'].replace({'No': 0, 'Yes': 1})
df['RainTomorrow'] = df['RainTomorrow'].replace({'No': 0, 'Yes': 1})
sns.set(rc={'figure.figsize':(8,6)})
# Compute the correlation matrix once (the original called df.corr() twice) and
# restrict it to numeric columns: object columns (Location, wind directions)
# cannot be correlated and make df.corr() raise in pandas >= 2.0.
corr = df.corr(numeric_only=True)
sns.heatmap(corr, mask=np.triu(corr))
<AxesSubplot:>
#plot counts RainTomorrow
fig = sns.catplot(x='RainTomorrow', kind='count', data=df, height=5, aspect=2)
fig.set(title='Histogram for RainTomorrow')
<seaborn.axisgrid.FacetGrid at 0x14b212640>
From the plot we can see that there is a large imbalance between the two classes; observing rain is quite anomalous in large parts of Australia, after all. We should therefore avoid metrics such as accuracy when evaluating the predictive models, because a trivial deterministic predictor (classifying every datapoint as 0 with probability 1) would already achieve high accuracy (~78%). An alternative is average precision (the area under the precision-recall curve), since this metric accounts for both positive and negative classes. Simply put, precision is the ratio of true positives to total predicted positives: we want almost all our forecasted days of rain to be days where this anomaly was actually observed.
# Map Tokens
# SECURITY(review): a live Mapbox token is hardcoded in the source; it should be
# rotated and loaded from an environment variable or untracked config file.
# Use a context manager so the file handle is closed even if the write fails.
with open('data/mapbox_token', 'w+') as f:
    f.write('pk.eyJ1Ijoic2h0cmF1c3NhcnQiLCJhIjoiY2t0eXhzdGFpMWlscTJ1cDg3ZGhocmptayJ9.7TepS9eN6iKXwYjPHu44Tg')
map_token = 'pk.eyJ1Ijoic2h0cmF1c3NhcnQiLCJhIjoiY2t0eXhzdGFpMWlscTJ1cDg3ZGhocmptayJ9.7TepS9eN6iKXwYjPHu44Tg'
df.loc[df['MaxTemp'] <0 , 'MaxTemp'] = 0  # clamp negative maxima to 0 so they can serve as marker sizes on the map
# Split the date into Year/Month/Day features and drop the raw Date column.
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df.drop('Date', axis=1, inplace = True)
px.set_mapbox_access_token(open('data/mapbox_token').read())
# Yearly average maximum temperature per location, with coordinates re-attached.
df_avg = df.groupby(['Location', 'Year'])['MaxTemp'].mean().reset_index().rename(columns={'MaxTemp': 'AvgMaxTemp'})
df_avg = pd.merge(df_avg, latlong_df, how='left', on='Location', validate='many_to_one')
df_avg.drop(df_avg[df_avg['Year'] == 2007].index, inplace=True)  # 2007 is only partially observed
fig = px.scatter_mapbox(df_avg, lat="lat", lon="lng", color='AvgMaxTemp', mapbox_style='light',
opacity=1.0, range_color=[df_avg['AvgMaxTemp'].min(), df_avg['AvgMaxTemp'].max()],
size='AvgMaxTemp', size_max=10,
animation_frame='Year',
zoom=2.8, center=dict(lat=-27, lon=140), height=650)
fig.update_layout(margin=dict(l=30, r=30, t=60, b=30))
fig.update_layout(template='plotly_white', showlegend=True)
fig.update_layout(font=dict(family='sans-serif', size=12));
fig.show()
df_2['Year'].value_counts().sort_index()
1970 184 1971 365 1972 365 1973 365 1974 365 1975 365 1976 366 1977 365 1978 365 1979 365 1980 366 1981 365 1982 365 1983 365 1984 366 1985 365 1986 365 1987 365 1988 366 1989 365 1990 365 1991 365 1992 366 1993 365 1994 365 1995 365 1996 366 1997 365 1998 365 1999 365 2000 366 2001 365 2002 365 2003 365 2004 366 2005 365 2006 365 2007 365 2008 366 2009 365 2010 365 2011 365 2012 366 2013 365 2014 365 2015 365 2016 366 2017 365 2018 365 2019 365 2020 366 2021 354 Name: Year, dtype: int64
# 1970 is a partial year (only 184 days recorded), so exclude it from the
# yearly averages. drop=True prevents reset_index from adding the old index
# back as a spurious 'index' column (the original omitted it).
df_2 = df_2[df_2['Year'] != 1970].reset_index(drop=True)
year_avg_temps = df_2.groupby(['Year'])['MaxTemp'].mean().reset_index().rename(columns={'MaxTemp': 'AvgTemp'})
year_avg_temps.head(10)
| Year | AvgTemp | |
|---|---|---|
| 0 | 1971 | 19.447397 |
| 1 | 1972 | 20.093699 |
| 2 | 1973 | 19.277260 |
| 3 | 1974 | 19.235068 |
| 4 | 1975 | 19.515616 |
| 5 | 1976 | 19.421858 |
| 6 | 1977 | 19.385479 |
| 7 | 1978 | 18.781644 |
| 8 | 1979 | 19.812603 |
| 9 | 1980 | 20.118033 |
year_avg_temps['Period'] = year_avg_temps['Year'].apply(lambda x: '1971 - 1999' if x < 2000 else '2000 - 2021')
year_avg_temps
| Year | AvgTemp | Period | |
|---|---|---|---|
| 0 | 1971 | 19.447397 | 1971 - 1999 |
| 1 | 1972 | 20.093699 | 1971 - 1999 |
| 2 | 1973 | 19.277260 | 1971 - 1999 |
| 3 | 1974 | 19.235068 | 1971 - 1999 |
| 4 | 1975 | 19.515616 | 1971 - 1999 |
| 5 | 1976 | 19.421858 | 1971 - 1999 |
| 6 | 1977 | 19.385479 | 1971 - 1999 |
| 7 | 1978 | 18.781644 | 1971 - 1999 |
| 8 | 1979 | 19.812603 | 1971 - 1999 |
| 9 | 1980 | 20.118033 | 1971 - 1999 |
| 10 | 1981 | 20.079178 | 1971 - 1999 |
| 11 | 1982 | 20.548493 | 1971 - 1999 |
| 12 | 1983 | 19.066849 | 1971 - 1999 |
| 13 | 1984 | 19.181421 | 1971 - 1999 |
| 14 | 1985 | 19.308767 | 1971 - 1999 |
| 15 | 1986 | 18.710685 | 1971 - 1999 |
| 16 | 1987 | 19.168767 | 1971 - 1999 |
| 17 | 1988 | 20.310383 | 1971 - 1999 |
| 18 | 1989 | 19.321096 | 1971 - 1999 |
| 19 | 1990 | 19.831507 | 1971 - 1999 |
| 20 | 1991 | 19.480822 | 1971 - 1999 |
| 21 | 1992 | 18.580055 | 1971 - 1999 |
| 22 | 1993 | 19.504110 | 1971 - 1999 |
| 23 | 1994 | 19.627671 | 1971 - 1999 |
| 24 | 1995 | 18.416164 | 1971 - 1999 |
| 25 | 1996 | 18.615847 | 1971 - 1999 |
| 26 | 1997 | 19.998630 | 1971 - 1999 |
| 27 | 1998 | 19.673425 | 1971 - 1999 |
| 28 | 1999 | 20.230137 | 1971 - 1999 |
| 29 | 2000 | 20.184699 | 2000 - 2021 |
| 30 | 2001 | 19.995616 | 2000 - 2021 |
| 31 | 2002 | 20.344384 | 2000 - 2021 |
| 32 | 2003 | 20.031507 | 2000 - 2021 |
| 33 | 2004 | 19.590437 | 2000 - 2021 |
| 34 | 2005 | 20.611233 | 2000 - 2021 |
| 35 | 2006 | 20.237808 | 2000 - 2021 |
| 36 | 2007 | 21.213973 | 2000 - 2021 |
| 37 | 2008 | 20.209836 | 2000 - 2021 |
| 38 | 2009 | 20.918356 | 2000 - 2021 |
| 39 | 2010 | 19.954795 | 2000 - 2021 |
| 40 | 2011 | 19.841096 | 2000 - 2021 |
| 41 | 2012 | 20.158743 | 2000 - 2021 |
| 42 | 2013 | 20.812877 | 2000 - 2021 |
| 43 | 2014 | 21.115890 | 2000 - 2021 |
| 44 | 2015 | 20.828219 | 2000 - 2021 |
| 45 | 2016 | 20.434973 | 2000 - 2021 |
| 46 | 2017 | 21.066027 | 2000 - 2021 |
| 47 | 2018 | 21.019452 | 2000 - 2021 |
| 48 | 2019 | 21.259452 | 2000 - 2021 |
| 49 | 2020 | 19.746448 | 2000 - 2021 |
| 50 | 2021 | 19.602542 | 2000 - 2021 |
fig = plt.figure(figsize=(7, 7))
sns.set_style("darkgrid")
sns.set_context("talk")
sns.scatterplot(x=year_avg_temps['Year'], y=year_avg_temps['AvgTemp'], hue=year_avg_temps['Period'])
plt.title('Scatterplot of yearly average temperatures per year')
Text(0.5, 1.0, 'Scatterplot of yearly average temperatures per year')
fig = plt.figure(figsize=(7, 7))
sns.set_style("darkgrid")
sns.set_context("talk")
sns.boxplot(x=year_avg_temps['Period'], y=year_avg_temps['AvgTemp'])
plt.title('Boxplot of yearly average temperatures before and after 2000')
Text(0.5, 1.0, 'Boxplot of yearly average temperatures before and after 2000')
# Observed test statistic: difference of period means (after 2000 minus before).
temps_after_2000 = np.array(year_avg_temps[year_avg_temps['Period'] == '2000 - 2021']['AvgTemp'])
temps_before_2000 = np.array(year_avg_temps[year_avg_temps['Period'] == '1971 - 1999']['AvgTemp'])
test_statistic = np.mean(temps_after_2000) - np.mean(temps_before_2000)
def drawing_with_repl(data_1, data_2):
    """Randomly re-split the pooled samples into two groups of the original sizes.

    Despite the name, this draws WITHOUT replacement: the pooled data is
    permuted once and cut at len(data_1) -- the standard shuffle step of a
    permutation test.
    """
    pooled = np.concatenate((data_1, data_2))
    reshuffled = np.random.permutation(pooled)
    split_at = len(data_1)
    return reshuffled[:split_at], reshuffled[split_at:]
# Build the permutation null distribution of the mean difference.
# NOTE(review): 10,000,000 pure-Python iterations over a ~51-point sample is
# extremely slow (likely hours); a chunked, vectorised NumPy shuffle would
# produce the same distribution far faster.
sample_size = 10000000
shuffled_test_statistics = np.empty(sample_size)
for i in range(sample_size):
    sample_1, sample_2 = drawing_with_repl(temps_after_2000, temps_before_2000)
    shuffled_test_statistics[i] = np.mean(sample_1) - np.mean(sample_2)
fig = plt.figure(figsize=(7, 7))
sns.set_style("darkgrid")
sns.set_context("talk")
sns.histplot(shuffled_test_statistics, bins=50, kde=True)
sns.scatterplot(x=[test_statistic], y=[0], label='empirical test statistic')
plt.ylim([-15000, None])
plt.annotate(f'{np.round(test_statistic, 2)}', (test_statistic, 0), xytext=(test_statistic - 0.1, 15000))
plt.legend(loc='upper right', fontsize=12)
plt.title('Histogram of simulated test statistics')
Text(0.5, 1.0, 'Histogram of simulated test statistics')
# One-sided p-value: fraction of permuted statistics at least as extreme as observed.
p_value = np.sum(shuffled_test_statistics >= test_statistic) / len(shuffled_test_statistics)
p_value
1e-07
def elbow_rule(X, n1=1, n2=15, error=25):
    """Plot K-Means inertia for k in [n1, n2) and annotate the elbow.

    The elbow is the first k at which the drop in inertia between
    consecutive k values falls to ``error`` or below; if no drop is that
    small, the last k in the range is used.

    Parameters
    ----------
    X : array-like / DataFrame of shape (n_samples, n_features)
    n1, n2 : inclusive start / exclusive end of the k range
    error : inertia-drop threshold that defines the elbow

    Returns
    -------
    int : the chosen elbow k (new, backward-compatible return value).
    """
    ks = range(n1, n2)
    kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(X) for k in ks]
    inertias = [model.inertia_ for model in kmeans_per_k]

    plt.figure(figsize=(8, 3.5))
    plt.plot(ks, inertias, "bo-")
    plt.xlabel("$k$", fontsize=14)
    plt.ylabel("Inertia", fontsize=14)

    # Index of the elbow within `inertias`. Bug fix: default to 0 so
    # `opt` is always bound even when the range has fewer than two ks
    # (the old code raised NameError in that case).
    opt = 0
    for i in range(len(inertias) - 1):
        opt = i
        if inertias[i] - inertias[i + 1] <= error:
            opt = i + 1
            break

    # Bug fix: annotate the actual (k, inertia) point of the elbow. The
    # old code used the list index as the x coordinate (off the plotted
    # k range whenever n1 > 0) and inertias[opt - 2] as the y coordinate,
    # which wraps to the end of the list for opt < 2.
    plt.annotate('Elbow',
                 xy=(n1 + opt, inertias[opt]),
                 xytext=(0.55, 0.55), textcoords='figure fraction',
                 fontsize=16,
                 arrowprops=dict(facecolor='black', shrink=0.1))
    plt.show()
    return n1 + opt
# Choose k via the elbow rule, then cluster the stations by coordinates.
elbow_rule(df[['lat', 'lng']], 2, 8)

k_means = KMeans(n_clusters=4, random_state=42)
k_means_latlng = k_means.fit(df[['lat', 'lng']])
centroids = k_means_latlng.cluster_centers_
label = k_means_latlng.labels_
u_labels = np.unique(label)

# Plot each cluster in its own colour to sanity-check the regions.
for i in u_labels:
    plt.scatter(df['lng'][label == i], df['lat'][label == i], label=i)
plt.legend()
plt.show()

# Replace the city-level Location with the four cluster regions.
# NOTE(review): the 0..3 -> region name mapping assumes a fixed cluster
# numbering for random_state=42 — confirm if KMeans version/seed changes.
df = df.assign(loc=label)
# Bug fix: assign the result instead of calling replace(..., inplace=True)
# on a column selection, a chained-assignment pattern that raises
# FutureWarning and may silently no-op on newer pandas.
df['loc'] = df['loc'].replace({0: 'North', 1: 'South', 2: 'West', 3: 'East'})
df.drop(columns='Location', inplace=True)
df.rename(columns={'loc': 'Location'}, inplace=True)
Hypothesis Testing: Is there a significant difference in frequency of precipitation between the four locations we just defined?
# Kruskal-Wallis H-test: does RainToday frequency differ across regions?
rain_by_region = [df.loc[df['Location'] == region, 'RainToday'].to_numpy()
                  for region in ('South', 'East', 'West', 'North')]
kruskal(*rain_by_region)
KruskalResult(statistic=150.266360768042, pvalue=2.3083791853225876e-32)
lbls = ['North', 'South', 'West', 'East']
Given the p-value, we can see that there is a difference in the frequency of precipitation between the four locations. We investigate this further with pairwise tests below.
# Pairwise Kruskal-Wallis tests over all unordered region pairs.
for i in range(4):
    for j in range(i + 1, 4):
        sample_i = df.loc[df['Location'] == lbls[i], 'RainToday'].to_numpy()
        sample_j = df.loc[df['Location'] == lbls[j], 'RainToday'].to_numpy()
        p_v = kruskal(sample_i, sample_j).pvalue
        print(f'{lbls[i]} vs {lbls[j]}: p-value {p_v}')
North vs South: p-value 4.3980427178305245e-10 North vs West: p-value 2.5180361672318095e-08 North vs East: p-value 2.742382156876936e-30 South vs West: p-value 0.6841814051461885 South vs East: p-value 9.710565446281005e-16 West vs East: p-value 6.029904801415153e-08
df.head()
| MinTemp | MaxTemp | Rainfall | Evaporation | WindGustDir | WindGustSpeed | WindDir9am | WindDir3pm | WindSpeed9am | WindSpeed3pm | ... | Temp9am | Temp3pm | RainToday | RainTomorrow | lat | lng | Year | Month | Day | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17.9 | 35.2 | 0.0 | 12.0 | SSW | 48.0 | ENE | SW | 6.0 | 20.0 | ... | 26.6 | 33.4 | 0 | 0 | -31.4997 | 145.8319 | 2009 | 1 | 1 | South |
| 1 | 18.4 | 28.9 | 0.0 | 14.8 | S | 37.0 | SSE | SSE | 19.0 | 19.0 | ... | 20.3 | 27.0 | 0 | 0 | -31.4997 | 145.8319 | 2009 | 1 | 2 | South |
| 2 | 19.4 | 37.6 | 0.0 | 10.8 | NNE | 46.0 | NNE | NNW | 30.0 | 15.0 | ... | 28.7 | 34.9 | 0 | 0 | -31.4997 | 145.8319 | 2009 | 1 | 4 | South |
| 3 | 21.9 | 38.4 | 0.0 | 11.4 | WNW | 31.0 | WNW | WSW | 6.0 | 6.0 | ... | 29.1 | 35.6 | 0 | 0 | -31.4997 | 145.8319 | 2009 | 1 | 5 | South |
| 4 | 24.2 | 41.0 | 0.0 | 11.2 | WNW | 35.0 | NW | WNW | 17.0 | 13.0 | ... | 33.6 | 37.6 | 0 | 0 | -31.4997 | 145.8319 | 2009 | 1 | 6 | South |
5 rows × 26 columns
print(list(df.columns))
['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow', 'lat', 'lng', 'Year', 'Month', 'Day', 'Location']
To simplify analysis, we focus on the month instead of the day and year as a time feature; computational resource constraints also prompt us to label Locations as regions, rather than complexifying this further with individual city labels and considering both latitude and longitude.
# Snapshot of all column names before dropping anything.
features = list(df.columns)
# Separating out the features - trying to predict if it will rain tomorrow given our features
y = df['RainTomorrow']
print(y)
# Drop the target plus redundant time/space features (Month and the
# clustered Location are kept as the time/space representation).
df = df.drop(columns=['Year', 'Day', 'RainTomorrow', 'lat', 'lng'])
print(df.shape[1])
# Sanity check: should print 0 (no rows lost relative to the target).
print(len(y)-df.shape[0])
0 0
1 0
2 0
3 0
4 0
..
116903 0
116904 0
116905 0
116906 0
116907 0
Name: RainTomorrow, Length: 116908, dtype: int64
21
0
# Define categorical columns (to be one-hot encoded below).
categorical = ['RainToday', 'Month', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
print(f"Categorical columns are: {categorical}")
# Define numerical columns (to be z-score standardized below).
numerical = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'WindGustSpeed',
             'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
             'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
print(f"Numerical columns are: {numerical}")
Categorical columns are: ['RainToday', 'Month', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'] Numerical columns are: ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
One-hot encoding of categorical variables (RainToday is already encoded, since it is a binary variable)
# One-hot encode each multi-category column, dropping the source column.
# The loop replaces five near-identical copy-pasted blocks; processing
# order matches the original so the final column order is unchanged.
# (RainToday is already binary-encoded and needs no dummies.)
for col in ['Month', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']:
    dummies = pd.get_dummies(df[col], prefix=col)
    df = df.join(dummies).drop(columns=col)
df
| MinTemp | MaxTemp | Rainfall | Evaporation | WindGustSpeed | WindSpeed9am | WindSpeed3pm | Humidity9am | Humidity3pm | Pressure9am | ... | WindDir3pm_NNW | WindDir3pm_NW | WindDir3pm_S | WindDir3pm_SE | WindDir3pm_SSE | WindDir3pm_SSW | WindDir3pm_SW | WindDir3pm_W | WindDir3pm_WNW | WindDir3pm_WSW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17.9 | 35.2 | 0.0 | 12.0 | 48.0 | 6.0 | 20.0 | 20.0 | 13.0 | 1006.3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 18.4 | 28.9 | 0.0 | 14.8 | 37.0 | 19.0 | 19.0 | 30.0 | 8.0 | 1012.9 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 2 | 19.4 | 37.6 | 0.0 | 10.8 | 46.0 | 30.0 | 15.0 | 42.0 | 22.0 | 1012.3 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 21.9 | 38.4 | 0.0 | 11.4 | 31.0 | 6.0 | 6.0 | 37.0 | 22.0 | 1012.7 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | 24.2 | 41.0 | 0.0 | 11.2 | 35.0 | 17.0 | 13.0 | 19.0 | 15.0 | 1010.7 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 116903 | 3.5 | 21.8 | 0.0 | 10.0 | 31.0 | 15.0 | 13.0 | 59.0 | 27.0 | 1024.7 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 116904 | 2.8 | 23.4 | 0.0 | 10.0 | 31.0 | 13.0 | 11.0 | 51.0 | 24.0 | 1024.6 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 116905 | 3.6 | 25.3 | 0.0 | 10.0 | 22.0 | 13.0 | 9.0 | 56.0 | 21.0 | 1023.5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 116906 | 5.4 | 26.9 | 0.0 | 10.0 | 37.0 | 9.0 | 9.0 | 53.0 | 24.0 | 1021.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 116907 | 7.8 | 27.0 | 0.0 | 10.0 | 28.0 | 13.0 | 7.0 | 51.0 | 24.0 | 1019.4 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
116908 rows × 80 columns
Standardization (z-score) of numerical variables
# Standardize (z-score) the numerical variables.
# Fix: the 15-column list was previously duplicated verbatim; define it once.
# NOTE(review): scaling on the full sample before the train/test split
# leaks test-set statistics into training — ideally fit the scaler on
# the training split only. Confirm whether this matters for the report.
num_cols = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'WindGustSpeed',
            'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
            'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
num_variables = df[num_cols]
df[num_cols] = (num_variables - num_variables.mean()) / num_variables.std()
x = df
x
| MinTemp | MaxTemp | Rainfall | Evaporation | WindGustSpeed | WindSpeed9am | WindSpeed3pm | Humidity9am | Humidity3pm | Pressure9am | ... | WindDir3pm_NNW | WindDir3pm_NW | WindDir3pm_S | WindDir3pm_SE | WindDir3pm_SSE | WindDir3pm_SSW | WindDir3pm_SW | WindDir3pm_W | WindDir3pm_WNW | WindDir3pm_WSW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.856963 | 1.647732 | -0.276594 | 0.982573 | 0.528915 | -1.112858 | 0.071347 | -2.483325 | -1.804439 | -1.642333 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 0.935645 | 0.765747 | -0.276594 | 1.481582 | -0.295102 | 0.449959 | -0.045120 | -1.959900 | -2.042651 | -0.666689 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1.093010 | 1.983727 | -0.276594 | 0.768713 | 0.379093 | 1.772343 | -0.510986 | -1.331789 | -1.375658 | -0.755384 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1.486423 | 2.095725 | -0.276594 | 0.875643 | -0.744566 | -1.112858 | -1.559185 | -1.593502 | -1.375658 | -0.696254 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | 1.848363 | 2.459719 | -0.276594 | 0.840000 | -0.444923 | 0.209526 | -0.743919 | -2.535668 | -1.709154 | -0.991904 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 116903 | -1.409096 | -0.228237 | -0.276594 | 0.626139 | -0.744566 | -0.030908 | -0.743919 | -0.441966 | -1.137446 | 1.077645 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 116904 | -1.519251 | -0.004241 | -0.276594 | 0.626139 | -0.744566 | -0.271341 | -0.976852 | -0.860706 | -1.280373 | 1.062862 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 116905 | -1.393359 | 0.261755 | -0.276594 | 0.626139 | -1.418761 | -0.271341 | -1.209785 | -0.598994 | -1.423300 | 0.900255 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 116906 | -1.110102 | 0.485751 | -0.276594 | 0.626139 | -0.295102 | -0.752208 | -1.209785 | -0.756021 | -1.280373 | 0.530693 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 116907 | -0.732426 | 0.499751 | -0.276594 | 0.626139 | -0.969297 | -0.271341 | -1.442718 | -0.860706 | -1.280373 | 0.294173 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
116908 rows × 80 columns
x.columns
Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'WindGustSpeed',
'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
'Temp3pm', 'RainToday', 'Month_1', 'Month_2', 'Month_3', 'Month_4',
'Month_5', 'Month_6', 'Month_7', 'Month_8', 'Month_9', 'Month_10',
'Month_11', 'Month_12', 'Location_East', 'Location_North',
'Location_South', 'Location_West', 'WindGustDir_E', 'WindGustDir_ENE',
'WindGustDir_ESE', 'WindGustDir_N', 'WindGustDir_NE', 'WindGustDir_NNE',
'WindGustDir_NNW', 'WindGustDir_NW', 'WindGustDir_S', 'WindGustDir_SE',
'WindGustDir_SSE', 'WindGustDir_SSW', 'WindGustDir_SW', 'WindGustDir_W',
'WindGustDir_WNW', 'WindGustDir_WSW', 'WindDir9am_E', 'WindDir9am_ENE',
'WindDir9am_ESE', 'WindDir9am_N', 'WindDir9am_NE', 'WindDir9am_NNE',
'WindDir9am_NNW', 'WindDir9am_NW', 'WindDir9am_S', 'WindDir9am_SE',
'WindDir9am_SSE', 'WindDir9am_SSW', 'WindDir9am_SW', 'WindDir9am_W',
'WindDir9am_WNW', 'WindDir9am_WSW', 'WindDir3pm_E', 'WindDir3pm_ENE',
'WindDir3pm_ESE', 'WindDir3pm_N', 'WindDir3pm_NE', 'WindDir3pm_NNE',
'WindDir3pm_NNW', 'WindDir3pm_NW', 'WindDir3pm_S', 'WindDir3pm_SE',
'WindDir3pm_SSE', 'WindDir3pm_SSW', 'WindDir3pm_SW', 'WindDir3pm_W',
'WindDir3pm_WNW', 'WindDir3pm_WSW'],
dtype='object')
We stratify the split with respect to the presence of rain tomorrow, since otherwise most of the data splits would be dominated by "no rain" examples (these are in overwhelming abundance in the data; it doesn't rain much in Australia, fancy that).
# Hold out 20% for testing; stratify on the target because "rain
# tomorrow" is a heavily imbalanced label.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42)

# Hyperparameter candidates for each architecture.
param_grid_lr = [{'penalty': ['l1', 'l2'],
                  'C': [0.2, 1, 5]}]
param_grid_rf = [{'criterion': ['gini', 'entropy'],
                  'max_depth': [16, 20, 50],
                  'n_estimators': [150, 200]}]
param_grid_xgb = {'max_depth': [12],
                  'learning_rate': [0.2],
                  'n_estimators': [300]}

# Base estimators with fixed seeds for reproducibility.
lr_cl = LogisticRegression(max_iter=500, random_state=42, solver='liblinear')
rf_cl = RandomForestClassifier(random_state=42)
xgb_cl = XGBClassifier(objective='binary:logistic', use_label_encoder=False,
                       eval_metric='map', random_state=42)

# Construct grid searches - sklearn can only use CPU
kfolds = 7
LR = GridSearchCV(estimator=lr_cl, param_grid=param_grid_lr,
                  scoring='average_precision', cv=kfolds,
                  n_jobs=-1, verbose=1)
RF = GridSearchCV(estimator=rf_cl, param_grid=param_grid_rf,
                  scoring='average_precision', cv=kfolds,
                  n_jobs=-1, verbose=1)
XGB = GridSearchCV(estimator=xgb_cl, param_grid=param_grid_xgb,
                   scoring='average_precision', cv=kfolds,
                   n_jobs=1, verbose=1, error_score='raise')

# Searches are fitted in this order; names are for reporting only.
grids = [RF, LR, XGB]
grid_dict = {0: 'Random Forest',
             1: 'Logistic Regression',
             2: 'Extreme Gradient Boosting'}
# Fit each grid search, report test metrics, and track the model with
# the best test average precision.
best_precision = 0.0
# Fix: pre-bind so the final prints cannot hit NameError if no model
# ever exceeds the initial threshold.
best_clf = None
best_clf_idx = None
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    # Fit model to data with GridSearch candidate params
    gs.fit(x_train, y_train)
    # Collect CV results for plotting
    if idx == 0:
        cv_RF = gs.cv_results_
    if idx == 1:
        cv_LR = gs.cv_results_
    print('Best params are : %s' % gs.best_params_)
    # Best training data average precision
    print('Best training average precision: %.4f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(x_test)
    y_pred_prob = gs.predict_proba(x_test)
    # Test metrics of model with best params
    print('Test accuracy score for best params: %.4f ' % accuracy_score(y_test, y_pred))
    print('Test recall score for best params: %.4f ' % recall_score(y_test, y_pred))
    print('Test precision score for best params: %.4f ' % precision_score(y_test, y_pred))
    # Fix: average precision was previously computed three times per
    # model; compute once and reuse.
    test_ap = average_precision_score(y_test, y_pred_prob[:, 1])
    print('Test average precision score for best params: %.4f ' % test_ap)
    print('Test ROC AUC for best params: %.4f ' % roc_auc_score(y_test, y_pred_prob[:, 1]))
    # Track best model
    if test_ap > best_precision:
        best_precision = test_ap
        best_clf = gs.best_estimator_
        best_clf_idx = idx
print(f'\nClassifier with best test average precision: {grid_dict[best_clf_idx]}')
print(best_clf)
print(f'Test average precision: {best_precision}')
Estimator: Random Forest
Fitting 7 folds for each of 12 candidates, totalling 84 fits
Best params are : {'criterion': 'entropy', 'max_depth': 50, 'n_estimators': 200}
Best training average precision: 0.7415
Test accuracy score for best params: 0.8618
Test recall score for best params: 0.5124
Test precision score for best params: 0.7890
Test average precision score for best params: 0.7463
Test ROC AUC for best params: 0.8935
Estimator: Logistic Regression
Fitting 7 folds for each of 6 candidates, totalling 42 fits
Best params are : {'C': 0.2, 'penalty': 'l1'}
Best training average precision: 0.7013
Test accuracy score for best params: 0.8515
Test recall score for best params: 0.5155
Test precision score for best params: 0.7343
Test average precision score for best params: 0.7037
Test ROC AUC for best params: 0.8739
Estimator: Extreme Gradient Boosting
Fitting 7 folds for each of 1 candidates, totalling 7 fits
Best params are : {'learning_rate': 0.2, 'max_depth': 12, 'n_estimators': 300}
Best training average precision: 0.7500
Test accuracy score for best params: 0.8638
Test recall score for best params: 0.5752
Test precision score for best params: 0.7511
Test average precision score for best params: 0.7549
Test ROC AUC for best params: 0.8954
Classifier with best test average precision: Extreme Gradient Boosting
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='map',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.2, max_delta_step=0,
max_depth=12, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=300, n_jobs=8,
num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
use_label_encoder=False, validate_parameters=1, verbosity=None)
Test average precision: 0.7548898321880317
RocCurveDisplay.from_estimator(best_clf, x_test, y_test)
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x14e6dda60>
Bar plot of top 20 feature importances
# Horizontal bar plot of the 20 most important features, largest on top.
importances = best_clf.feature_importances_
top20_asc = np.argsort(importances)[::-1][:20][::-1]
plt.barh(np.array(x.columns)[top20_asc], importances[top20_asc])
<BarContainer object of 20 artists>
We can visualize the performance of our architectures that had hyperparameters tuned through GridSearchCV (logistic regression and random forest) through a simple plot below. For logistic regression, we see that the choice of parameter candidate values that we chose do not greatly impact the predictive performance of the model overall.
# LR CV results ranked best-first, indexed by a compact "C_penalty" tag.
results_df_LR = pd.DataFrame(cv_LR).sort_values(by=["rank_test_score"])
row_tags = results_df_LR["params"].apply(
    lambda p: "_".join(str(v) for v in p.values()))
results_df_LR = results_df_LR.set_index(row_tags).rename_axis("LR Model")
results_df_LR[["params", "rank_test_score", "mean_test_score", "std_test_score"]]
| params | rank_test_score | mean_test_score | std_test_score | |
|---|---|---|---|---|
| LR Model | ||||
| 0.2_l1 | {'C': 0.2, 'penalty': 'l1'} | 1 | 0.701250 | 0.005877 |
| 0.2_l2 | {'C': 0.2, 'penalty': 'l2'} | 2 | 0.701206 | 0.005953 |
| 1_l1 | {'C': 1, 'penalty': 'l1'} | 3 | 0.701198 | 0.005916 |
| 1_l2 | {'C': 1, 'penalty': 'l2'} | 4 | 0.701191 | 0.005936 |
| 5_l1 | {'C': 5, 'penalty': 'l1'} | 5 | 0.701188 | 0.005930 |
| 5_l2 | {'C': 5, 'penalty': 'l2'} | 6 | 0.701180 | 0.005933 |
We also see how our CV results shaped up over time. For logistic regression, all model parameter combinations perform generally "the same" over time. I say this in quotes as statistical tests (such as a corrected t-test) need to be performed to ascertain whether two models are statistically different. Note that the performance of the model highly depends on the fold.
# Per-fold CV scores of each LR parameter combination, best models first.
model_scores_LR = results_df_LR.filter(regex=r"split\d*_test_score")
# Plot score against CV fold for every combination.
fig, ax = plt.subplots()
g = sns.lineplot(data=model_scores_LR.transpose().iloc[:kfolds + 1],
                 dashes=False, marker="o", alpha=0.5, ax=ax, palette="Set1")
ax.set_xlabel("CV Test Fold", size=12, labelpad=10)
ax.set_ylabel("Model AUC", size=12)
ax.tick_params(bottom=True, labelbottom=False)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
g.set_title("Architecture Performance Over Folds - Logistic Regression")
plt.show()
Since all models are evaluated on the same k partitions without any randomization of the data thereafter (as opposed to setting the cv parameter in GridSearch to, say, RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)) we expect covariance between models -- made significant observing that the model was not sensitive to the candidate parameter combinations. The decision was made not to use this repeated stratified K-Fold with different randomization in each repetition due to the availability of machines capable of handling such computations at the homes of the authors.
However, only further testing (such as conducting a corrected paired t-test) would allow us to ascertain whether these models are statistically different (or not).
# Correlation of AUC scores across folds.
# Fix: .transpose() already returns a DataFrame, so the pd.DataFrame(...)
# wrapper around it was redundant.
corr_LR = model_scores_LR.transpose().corr()
corr_LR
| LR Model | 0.2_l1 | 0.2_l2 | 1_l1 | 1_l2 | 5_l1 | 5_l2 |
|---|---|---|---|---|---|---|
| LR Model | ||||||
| 0.2_l1 | 1.000000 | 0.999905 | 0.999934 | 0.999923 | 0.999933 | 0.999934 |
| 0.2_l2 | 0.999905 | 1.000000 | 0.999950 | 0.999972 | 0.999963 | 0.999962 |
| 1_l1 | 0.999934 | 0.999950 | 1.000000 | 0.999995 | 0.999997 | 0.999999 |
| 1_l2 | 0.999923 | 0.999972 | 0.999995 | 1.000000 | 0.999998 | 0.999998 |
| 5_l1 | 0.999933 | 0.999963 | 0.999997 | 0.999998 | 1.000000 | 1.000000 |
| 5_l2 | 0.999934 | 0.999962 | 0.999999 | 0.999998 | 1.000000 | 1.000000 |
As for the random forest, we observe that different combinations of parameters led to a greater variety of mean test scores. We would have to determine whether these models are statistically different with further testing.
# RF CV results ranked best-first, indexed by "criterion_depth_trees" tags.
results_df_RF = pd.DataFrame(cv_RF).sort_values(by=["rank_test_score"])
row_tags = results_df_RF["params"].apply(
    lambda p: "_".join(str(v) for v in p.values()))
results_df_RF = results_df_RF.set_index(row_tags).rename_axis("RF_Model")
results_df_RF[["params", "rank_test_score", "mean_test_score", "std_test_score"]]
| params | rank_test_score | mean_test_score | std_test_score | |
|---|---|---|---|---|
| RF_Model | ||||
| entropy_50_200 | {'criterion': 'entropy', 'max_depth': 50, 'n_e... | 1 | 0.741511 | 0.004191 |
| entropy_50_150 | {'criterion': 'entropy', 'max_depth': 50, 'n_e... | 2 | 0.739785 | 0.004977 |
| entropy_20_200 | {'criterion': 'entropy', 'max_depth': 20, 'n_e... | 3 | 0.739513 | 0.004542 |
| entropy_20_150 | {'criterion': 'entropy', 'max_depth': 20, 'n_e... | 4 | 0.738767 | 0.004501 |
| gini_50_200 | {'criterion': 'gini', 'max_depth': 50, 'n_esti... | 5 | 0.738479 | 0.003924 |
| gini_20_200 | {'criterion': 'gini', 'max_depth': 20, 'n_esti... | 6 | 0.736867 | 0.003906 |
| gini_50_150 | {'criterion': 'gini', 'max_depth': 50, 'n_esti... | 7 | 0.736552 | 0.004352 |
| gini_20_150 | {'criterion': 'gini', 'max_depth': 20, 'n_esti... | 8 | 0.735648 | 0.003751 |
| entropy_16_200 | {'criterion': 'entropy', 'max_depth': 16, 'n_e... | 9 | 0.733332 | 0.004343 |
| entropy_16_150 | {'criterion': 'entropy', 'max_depth': 16, 'n_e... | 10 | 0.732735 | 0.004467 |
| gini_16_200 | {'criterion': 'gini', 'max_depth': 16, 'n_esti... | 11 | 0.731681 | 0.005084 |
| gini_16_150 | {'criterion': 'gini', 'max_depth': 16, 'n_esti... | 12 | 0.730933 | 0.005388 |
We observe that the random forest models with the entropy criterion generally performed better over the gini variety. A weak argument could be made that, given the criterion, increasing the number of trees in the forest and the maximum depth of a tree yields more performant models under AUC.
# Per-fold CV scores of each RF parameter combination, best models first.
model_scores_RF = results_df_RF.filter(regex=r"split\d*_test_score")
# Plot score against CV fold for every combination.
fig, ax = plt.subplots()
p = sns.lineplot(data=model_scores_RF.transpose().iloc[:kfolds + 1],
                 dashes=False, marker="o", alpha=0.5, ax=ax, palette="viridis")
ax.set_xlabel("CV Test Fold", size=12, labelpad=10)
ax.set_ylabel("Model AUC", size=12)
ax.tick_params(bottom=True, labelbottom=False)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
p.set_title("Architecture Performance Over Folds - Random Forest")
plt.show()